System Info
# session info
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.6.0 magrittr_1.5 tools_3.6.0 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.1 stringi_1.4.3 rmarkdown_1.12
## [9] knitr_1.22 stringr_1.4.0 xfun_0.6 digest_0.6.18
## [13] evaluate_0.13
Load Libraries
library(tibble)
library(magrittr)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
Load Data
# Read in all the data
# Rel Abund = (Sum of Contig count)/(QC sequence abundace)
MAGnum_df <- read.delim("input_matrix_v2.txt") %>%
tibble::column_to_rownames(var = "X")
MAGnum_mat <- MAGnum_df %>%
data.matrix()
MAGnum_count <- read.table("input_matrix_v1.counts.txt", sep="\t", header=TRUE)
rownames(MAGnum_count) <- MAGnum_count[,1]
MAGnum_count<-MAGnum_count[,-1]
head(MAGnum_count)
# read in the Bin stats
bin_stats <- read.table("bin_stats.txt", sep="\t", header=TRUE);
rownames(bin_stats)<-bin_stats[,1]
bin_stats<- bin_stats[,-1]
# Select the Top X abundance bins
abundant_mags <- MAGnum_mat[rowMeans(MAGnum_mat) > 0.0001, ]
# read in the Sample metadata
# Add a new column with color information for plotting
metadat <-
read.table("Sample_metadata.txt", sep = "\t", header = TRUE) %>%
mutate(colors = ifelse(Location == "Alaska", "cornflowerblue",
ifelse(Location == "Bodega Bay", "red",
ifelse(Location == "Japan North", "darkorchid",
ifelse(Location == "Japan South", "darkorchid1",
ifelse(Location == "Massachusetts", "deeppink",
ifelse(Location == "North Carolina", "darkslategray3",
ifelse(Location == "Quebec", "darkorange",
ifelse(Location == "San Diego", "green4",
ifelse(Location == "Washington", "ivory4",
ifelse(Location == "Norway", "firebrick",
ifelse(Location == "Sweden", "dodgerblue4",
ifelse(Location == "Wales", "gold2",
ifelse(Location == "Portugal", "forestgreen",
ifelse(Location == "French Med", "deepskyblue",
ifelse(Location == "Croatia", "black",
NA))))))))))))))))
# Set the colors of the different columns going in
colz <- metadat$colors
location_colors <- c(
Alaska = "cornflowerblue",
"Bodega Bay" = "red",
"Japan North" = "darkorchid",
"Japan South" = "darkorchid1",
Massachusetts = "deeppink",
"North Carolina" = "darkslategray3",
Quebec = "darkorange",
"San Diego" = "green4",
Washington = "ivory4",
Norway = "firebrick",
Sweden = "dodgerblue4",
Wales = "gold2",
Portugal = "forestgreen",
"French Med" = "deepskyblue",
Croatia = "black")
# Set the color palette
my_palette <- colorRampPalette(c("red","white ", "blue"))(n = 299)
Normalization
# Normalizing the matrix
# Dividing rows by the genome size for the bin
# Genome size is different from Bin size
# Genome size is BinSize * BinCompletion
# Calculate the genome size
genome_sizes<- bin_stats$size * bin_stats$SCG_completeness
# Do the normalization
MAGnum_norm<-t(t(MAGnum_count) / genome_sizes)
Plot
Raw relative abundance data
# Without any scaling
heatmap.2(MAGnum_mat,
main = "NOT scaled Relative Abundance Data",
distfun = function(x) dist(x, method = "euclidean"),
hclustfun = function(x) hclust(x,method = "complete"),
#scale = "row",
col = my_palette,
trace = "none",
ColSideColors = colz,
key = TRUE, symkey = FALSE,
density.info = "none", srtCol = 30,
margins=c(5.5,7), cexRow = 1.25, cexCol = 1.25,
key.xlab = "MAG Abundance",
lhei = c(1.5,9),
key.title=NA)
# Add legend
legend(y=0.75, x=0.05,
xpd=TRUE,
legend = unique(metadat$Location),
col = unique(metadat$colors),
lty= 1, lwd = 5, cex=.7)

Scaled relative abundance
heatmap.2(MAGnum_mat,
main="Relative abundance scaled",
distfun = function(x) dist(x, method = "euclidean"),
hclustfun = function(x) hclust(x,method = "complete"),
scale = "row",
col = my_palette,
trace = "none",
key = TRUE, symkey = FALSE,
density.info = "none", srtCol = 30,
margins=c(5.5,7), cexRow = 1.25,cexCol = 1.25,
key.xlab = "MAG Abundance",
lhei = c(1.5,9),
key.title=NA,
ColSideColors = colz)
# Add legend
legend(y=0.75, x=0.05,
xpd=TRUE,
legend = unique(metadat$Location),
col = unique(metadat$colors),
lty= 1, lwd = 5, cex=.7)

Scaled normalized data
Counts per Bin Per sample divided by genome size
# WITH scaling
heatmap.2(MAGnum_norm,
main = "Scaled Normalized Data",
distfun = function(x) dist(x, method = "euclidean"),
hclustfun = function(x) hclust(x,method = "complete"),
scale = "row",
col = my_palette,
trace = "none",
key = TRUE, symkey = FALSE,
density.info = "none", srtCol = 30,
margins=c(5.5,7), cexRow = 1.25, cexCol = 1.25,
key.xlab = "MAG Abundance",
lhei = c(1.5,9),
key.title=NA,
ColSideColors = colz)
# Add legend
legend(y=0.75, x=0.05,
xpd=TRUE,
legend = unique(metadat$Location),
col = unique(metadat$colors),
lty= 1, lwd = 5, cex=.7)

Scaled Meanm Abundance MAGs > 0.0001
# WITH scaling
heatmap.2(abundant_mags,
main = "Scaled Mean Top Abundant MAGs",
distfun = function(x) dist(x, method = "euclidean"),
hclustfun = function(x) hclust(x,method = "complete"),
scale = "row",
col = my_palette,
trace = "none",
key = TRUE, symkey = FALSE,
density.info = "none", srtCol = 30,
margins=c(5.5,7), cexRow = 1.25, cexCol = 1.25,
key.xlab = "MAG Abundance",
lhei = c(1.5,9),
key.title=NA,
ColSideColors = colz)
legend(y=0.75, x=0.05,
xpd=TRUE,
legend = unique(metadat$Location),
col = unique(metadat$colors),
lty= 1, lwd = 5, cex=.7)

Top MAG Abundance Plot
# Select the Top X abundance bins
top_mags <-
MAGnum_mat[rowMeans(MAGnum_mat) > 0.001, ]
# How many MAGs?
num_MAGs <- nrow(top_mags)
# Prepare dataframe for plotting
abund_df <- top_mags %>%
t() %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "MAG") %>%
rename(Sample = MAG) %>%
gather("MAG", "norm_abund", 2:(num_MAGs+1)) %>%
left_join(metadat, by = "Sample") %>%
dplyr::select(-colors)
## Warning: Column `Sample` joining character vector and factor, coercing into
## character vector
# What's the data frame look like?
datatable(abund_df, options = list(pageLength = 10))
# Plot each MAG by location
MAG_abund_plot <-
ggplot(abund_df,
aes(x = Location, y = norm_abund, color = Location, fill = Location)) +
geom_jitter() +
#geom_boxplot(alpha = 0.4, outlier.shape = NA) +
theme_minimal() +
scale_color_manual(values = location_colors) +
scale_fill_manual(values = location_colors) +
facet_wrap(~MAG, scales = "free") +
labs(y = "Normalized MAG Abundance") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))
# Interactive Plot
ggplotly(MAG_abund_plot) # Make the plot interactive